Importing Libraries

library(ggplot2)
library(tidyverse)
library(dplyr)
library(corrplot)
library(naniar)
library(plotly)
library(data.table)           #for fread()
library(plyr)
library(scales)               #to overrride the default breaks, labels
library(cowplot)
library(lubridate)
library(crosstable)
library(kableExtra)
library(DT)
library(caret)                #for createdatapartition()
library(forcats)
library(TTR)                  #for SMA() 
library(tidyr)
library(jtools)               #for effect_plot()      

1. About COVID dataset

This COVID dataset is obtained from Our World in Data github repository. They continuously update the COVID dataset. This dataset contains 67 variables and 253472 observations but I will be only using few of the variables for the analysis.

covid.data <- fread("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv")

str(covid.data)
## Classes 'data.table' and 'data.frame':   253473 obs. of  67 variables:
##  $ iso_code                                  : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ continent                                 : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ location                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date                                      : IDate, format: "2020-02-24" "2020-02-25" ...
##  $ total_cases                               : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ new_cases                                 : num  5 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed                        : num  NA NA NA NA NA 0.714 0.714 0 0 0 ...
##  $ total_deaths                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths                                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_cases_per_million                   : num  0.122 0.122 0.122 0.122 0.122 0.122 0.122 0.122 0.122 0.122 ...
##  $ new_cases_per_million                     : num  0.122 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed_per_million            : num  NA NA NA NA NA 0.017 0.017 0 0 0 ...
##  $ total_deaths_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_per_million                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed_per_million           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ reproduction_rate                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients_per_million                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions_per_million         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions_per_million        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests                               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests                                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests_per_thousand                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_per_thousand                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed_per_thousand           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ positive_rate                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_per_case                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_units                               : chr  "" "" "" "" ...
##  $ total_vaccinations                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_vaccinations_per_hundred            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated_per_hundred             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated_per_hundred       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters_per_hundred                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed_per_million     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed_per_hundred: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stringency_index                          : num  8.33 8.33 8.33 8.33 8.33 ...
##  $ population_density                        : num  54.4 54.4 54.4 54.4 54.4 ...
##  $ median_age                                : num  18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
##  $ aged_65_older                             : num  2.58 2.58 2.58 2.58 2.58 ...
##  $ aged_70_older                             : num  1.34 1.34 1.34 1.34 1.34 ...
##  $ gdp_per_capita                            : num  1804 1804 1804 1804 1804 ...
##  $ extreme_poverty                           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cardiovasc_death_rate                     : num  597 597 597 597 597 ...
##  $ diabetes_prevalence                       : num  9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
##  $ female_smokers                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ male_smokers                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ handwashing_facilities                    : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ hospital_beds_per_thousand                : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ life_expectancy                           : num  64.8 64.8 64.8 64.8 64.8 ...
##  $ human_development_index                   : num  0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
##  $ population                                : num  41128772 41128772 41128772 41128772 41128772 ...
##  $ excess_mortality_cumulative_absolute      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative_per_million   : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>

2. Data Cleaning

2.1 Missing Data?

I used naniar package gg_miss_var() function to visualize the missing data in whole dataframe.

#checking if there are missing data
gg_miss_var(covid.data)

2.2 Treating the missing data

Missing data cannot be neglected. Hence, I replaced the missing data with 0 and mean of respective variables. Since, I won’t use all the variables, I only replace the missing data in variables that I am likely to use in further steps. For the variables of which missing data to be replaced with 0, I replaced the NAs directly without using group_by(location) while for other of which NAs to be replaced with mean, I used group_by(location) first.

covid.data1 <- covid.data %>%
  mutate_at(vars(new_cases, total_cases, new_deaths, total_deaths, tests_per_case, new_tests, reproduction_rate, icu_patients, hosp_patients, people_fully_vaccinated),
            ~replace_na(.,
                        0)) %>%
  group_by(location) %>%
  mutate(human_development_index = replace_na(human_development_index, mean(human_development_index, na.rm = T)),
         population_density = replace_na(population_density, mean(population_density, na.rm = T)),
         gdp_per_capita = replace_na(gdp_per_capita, mean(gdp_per_capita, na.rm = T)),
         extreme_poverty = replace_na(extreme_poverty, mean(extreme_poverty, na.rm = T)),
         male_smokers = replace_na(male_smokers, mean(male_smokers, na.rm = T)),
         female_smokers = replace_na(female_smokers, mean(female_smokers, na.rm = T)),
         icu_patients = replace_na(icu_patients, mean(icu_patients, na.rm = T)),
         hosp_patients = replace_na(hosp_patients, mean(hosp_patients, na.rm = T)),
         handwashing_facilities = replace_na(handwashing_facilities, mean(handwashing_facilities, na.rm = T)),
         median_age = replace_na(median_age, mean(median_age, na.rm = T)),
         stringency_index = replace_na(stringency_index, mean(stringency_index, na.rm = T))) %>%
  ungroup()

2.3 Date column into date format

covid.data1$date <- as.Date(covid.data1$date, format = "%Y-%m-%d")

2.4 Cleaning continent and location column

While looking at the dataset, I found out that the location column has continents also which must be removed.

#Removing continents from country column
continents <- c("Asia", "Africa", "European Union", "Europe", "High income", "Lower middle income", "Low Income", "Upper middle income", 'Oceania', "South America", "North America", "International", "World")
covid <- subset(covid.data1, !(location %in% continents))

#Removing the blank spaces from continent column
covid <- subset(covid, !(continent == ""))

3. Data Manipulation and Visualizations

3.1 Countrywise COVID database

#creating month column
year.month <- covid %>%
  mutate(Month = as.character(lubridate::month(date)),
         Year = lubridate::year(date)) %>%
  select(continent, location, Year, Month, new_cases, new_deaths)

#recoding the month column
year.month$Month <- recode(year.month$Month, 
                      "1" = "January",
                      "2" = "February",
                      "3" = "March",
                      "4" = "April",
                      "5" = "May",
                      "6" = "June", 
                      "7" = "July",
                      "8" = "August",
                      "9" = "September",
                      "10" = "October",
                      "11" = "November",
                      "12" = "December")

covid_summary_statistics <- year.month %>%
  group_by(location, Month, Year) %>%
  dplyr::summarise(Total.Deaths = sum(new_deaths, na.rm = T),
         Total.Cases = sum(new_cases, na.rm = T),
         Mean.cases.per.day = round(mean(new_cases, na.rm =T),2),
         Mean.deaths.per.day = round(mean(new_deaths, na.rm =T),2))

covid_summary_statistics %>%
  select(location, Year, Month, Total.Cases, Total.Deaths, Mean.cases.per.day, Mean.deaths.per.day) %>%
  datatable(
    rownames = F,
    class = "cell-border stripe",
    colnames = c("Country", "Year", "Month", "Total cases", "Total Deaths", "Mean cases per day", "Mean Deaths per day"),
    caption = "Country wise COVID-19 Cases and Deaths",
    options = list(columnDefs = list(list(className = "dt-center", targets = 0:1)))
  )

3.2 Heatmaps

3.2.1 Heatmap of total COVID infections

#latest day
day_latest <- max(covid$date)

#creating heatmaps
covid.cases <- covid %>%
  group_by(location) %>%
  filter(date == max(date))

#creating covid cases heat maps
line <- list(color = toRGB("#d1d1d1"), width = 0.4)
heatmap <- list(
  showframe = F,
  showcoastlines = F,
  projection = list(type = "orthographic"),
  resolution = "100",
  showcountries = T,
  countrycolor = "#d1d1d1",
  showocean = T,
  oceancolor = '#064273',
  showlakes = T,
  lakecolor = '#99c0db',
  showrivers = T,
  rivercolor = '#99c0db',
  bgcolor = '#e8f7fc')

plot_geo() %>%
  layout(geo = heatmap,
         paper_bgcolor = '#e8f7fc',
         title = paste0("World COVID-19 Confirmed Cases till ", day_latest)) %>%
  add_trace(data = covid.cases,
            z = ~total_cases,
            colors = "Reds",
            text = ~location,
            locations = ~iso_code,
            marker = list(line = line))

3.2.2 Heatmap of total COVID deaths

##Heatmap for covid deaths
plot_geo() %>%
  layout(geo = heatmap,
         paper_bgcolor = '#e8f7fc',
         title = paste0("World COVID-19 deaths till ",  day_latest)) %>%
  add_trace(data = covid.cases,
            z = ~total_deaths,
            colors = "Reds",
            text = ~location,
            locations = ~iso_code,
            marker = list(line = line))

3.3 Trend of total COVID cases and deaths

#Trend of world covid cases and deaths
covid %>%
  group_by(date) %>%
  filter(date != day_latest) %>%
  dplyr::summarise(total_deaths = sum(total_deaths, na.rm = T), 
                   total_cases = sum(total_cases, na.rm = T), .groups = "drop") %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = total_cases + 1), color = "#2e9449", linewidth = 1.5) +
  geom_line(aes(y = total_deaths + 1), linewidth = 1.5, linetype = 2, color = "#9c2742") +
  scale_y_continuous(trans = "log10", labels = comma) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  labs(title = "Global COVID infections and deaths",
       subtitle = paste0("Till ", day_latest - 1),
       x = "",
       y = "Log10 transformation") +
  theme_apa() +
  theme(axis.text.x = element_text(angle = 90, color = "black", hjust = 1),
        axis.text = element_text(color = "black")) +
  geom_vline(xintercept = as.Date("2020-03-11"), linetype = "longdash", linewidth = 0.8, col = "black") +
  annotate("text", x = as.Date("2020-03-10"), y = 11100, label = "WHO announces pandemic \n", size = 4.2, angle = 90) +
  geom_vline(xintercept = as.Date("2020-01-30"), linetype = "longdash", linewidth = 0.8, col = "black") +
  annotate("text", x = as.Date("2020-01-20"), y = 16100, label = "Global health emergency declared \n", size = 4.2, angle = 90) +
  annotate("text", x = as.Date("2021-05-05"), y = 1000000, label = "Total Deaths \n", size = 4.2) +
  annotate("text", x = as.Date("2021-05-05"), y = 50000000, label = "Total Cases \n", size = 4.2)

3.4 Trend of new COVID cases and deaths

3.4.1 Trend of new COVID cases

p1 <- covid %>%
  group_by(date, continent) %>%
  dplyr::summarise(new_covid_cases = sum(new_cases, na.rm = T), .groups = "drop") %>%
  ggplot(aes(date)) +
  geom_col(aes(y = new_covid_cases, color = continent)) +
  labs(
    title = "Trend of New COVID-19 cases in different continents",
    subtitle = paste0("Till ", day_latest - 1),
    y = "",
    x = ""
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 0.6)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "6 months") +
  facet_wrap(~continent)
p1

3.4.2 Trend of new COVID deaths

p2 <- covid %>%
  group_by(date, continent) %>%
  dplyr::summarise(new_covid_deaths = sum(new_deaths, na.rm = T)) %>% 
  ggplot(aes(date)) +
  geom_col(aes(y = new_covid_deaths, color = continent)) +
  labs(
    title = "Trend of New COVID-19 deaths in different continents",
    subtitle = paste0("Till ", day_latest - 1),
    y = "",
    x = ""
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 0.6)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  facet_wrap(~continent)
p2

3.4 COVID cases in different months in different continents

#COVID cases by months
month.df <- year.month %>%
  group_by(Month, continent) %>%
  dplyr::summarise(total.cases = sum(new_cases, na.rm = T),
                   total.deaths = sum(new_deaths, na.rm = T))

month.df$Month <- factor(month.df$Month, levels=c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"))

p3<-ggplot(month.df, aes(x = Month, y = total.cases, fill = continent)) +
  geom_col(position = "dodge", color = "black") +
  # facet_wrap(~continent) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust=0.2),
        axis.text = element_text(size = 9, color = "black")) +
  labs(
    x = "",
    y = "Total Cases",
    title = "COVID - 19 cases in different continents in different months",
    subtitle = paste0("Till ", day_latest-0)
  ) +
  scale_y_continuous(label = comma) +
  coord_flip()
ggplotly(p3)

3.5 Countries with most COVID cases and deaths

3.5.1 Top Ten countries with most COVID infections

top.10.covid.countries <- covid %>%
  select(location, new_cases, date) %>%
  group_by(location) %>%
  dplyr::summarise(total.cases = sum(new_cases, na.rm = T)) %>%
  top_n(10, total.cases) %>%
  arrange(desc(total.cases)) %>%
  mutate(country.reordered = fct_reorder(location, total.cases))

p4 <- top.10.covid.countries %>%
  ggplot(aes(country.reordered, total.cases)) +
  geom_col() +
  geom_text(aes(label=total.cases), hjust = -0.1, size = 3) +
  scale_y_continuous(label = comma) +
  labs(
    x = "Total Cases",
    y = "",
    title = "Top 10 countries with highest COVID-19 infections",
    subtitle = paste0("Till ", day_latest-1)
  ) +
  theme_bw() +
  coord_flip()
p4

3.5.2 Top Ten countries with most COVID deaths

top.10.covid.deaths.countries <- covid %>%
  select(date, location, new_deaths) %>%
  group_by(location) %>%
  dplyr::summarise(total.deaths = sum(new_deaths, na.rm = T)) %>%
  top_n(10, total.deaths) %>%
  arrange(desc(location)) %>%
  mutate(country.reordered = fct_reorder(location, total.deaths))
p5 <- top.10.covid.deaths.countries %>%
  ggplot(aes(country.reordered, total.deaths)) +
  geom_col() +
  geom_text(aes(label = total.deaths), hjust = -0.1, size = 3) +
  scale_y_continuous(label = comma) +
  labs(x = "",
       y = "Total Deaths",
       title = "Top 10 countries with most COVID-19 deaths",
       subtitle = paste0("Till ", day_latest-1)) +
  theme_bw() +
  coord_flip()
p5

3.6 Bubble Charts

3.6.1 Bubble chart of total COVID cases of different countries

p6 <- covid %>%
  filter(date == max(day_latest - 1)) %>%
  select(total_cases, human_development_index, location, continent) %>%
  mutate(location = factor(location)) %>%
  ggplot(aes(human_development_index, total_cases/1000, size = total_cases, color = continent)) +
  geom_point(aes(text = location), alpha = 0.5) +
  scale_size(range=c(1,10), name = "") +
  theme_bw() +
  labs(
    x = "Human Development Index",
    y = "Total cases in thousands"
  )
   
ggplotly(p6, tooltip = c("text", "total_cases", "human_development_index")) %>%
  layout(title = list(text = paste0('Total Covid cases in different countries',
                                    '<br>',
                                    '<sup>',
                                    'Size of scatter plot denotes the covid cases','</sup>')),
         margin = list(t = 70))

3.6.2 Bubble chart of total COVID deaths of different countries

p7 <- covid %>%
  filter(date == max(day_latest - 1)) %>%
  select(total_deaths, human_development_index, location, continent) %>%
  mutate(location = factor(location)) %>%
  ggplot(aes(human_development_index, total_deaths/1000, size = total_deaths, color = continent)) +
  geom_point(aes(text = location), alpha = 0.5) +
  scale_size(range=c(1,10), name = "") +
  theme_bw() +
  labs(
    x = "Human Development Index",
    y = "Total deaths in thousands"
  )
 
ggplotly(p7, tooltip = c("text", "total_deaths", "human_development_index")) %>%
  layout(title = list(text = paste0('Total Covid deaths in different countries',
                                    '<br>',
                                    '<sup>',
                                    'Size of scatter plot denotes the covid deaths','</sup>')),
         margin = list(t = 70))

3.6.2 Bubble chart of COVID infection fatality rate of different countries

p8 <- covid %>%
  filter(date == max(day_latest - 1),
         location != "North Korea") %>%
  select(total_deaths, total_cases, location, continent) %>%
  mutate(location = factor(location),
         infection_fatality_rate = (total_deaths/total_cases)*100) %>%
  ggplot(aes(total_cases/1000, total_deaths/1000, size = infection_fatality_rate, color = continent)) +
  geom_point(aes(text = location), alpha = 0.45) +
  scale_size(range=c(1,10), name = "") +
  theme_bw() +
  labs(
    x = "Total cases in thousands",
    y = "Total deaths in thousands"
  )

ggplotly(p8, tooltip = "all") %>%
  layout(title = list(text = paste0('Total Covid cases vs total deaths in different countries',
                                    '<br>',
                                    '<sup>',
                                    'Size of bubble is based on infection fatality rate','<br>',
                                    'Infection mortality rate vary depending on fairness of reported covid cases and deaths', '</sup>')),
         margin = list(t = 100))

3.7 Correlation Matrix

covid.data.corr <- covid %>%
  select(new_cases, new_deaths, tests_per_case, new_tests, reproduction_rate, icu_patients, hosp_patients, human_development_index, gdp_per_capita, extreme_poverty, male_smokers, female_smokers, handwashing_facilities)

#Plotting the correlation matrix
cor <- cor(covid.data.corr)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(cor, method = 'color', 
         type = "upper", #Displays only upper part of the matrix
         order = "hclust",
         col=col(200),
         addCoef.col = "black",  #Add coeffiecient of correlation
         number.cex = 0.8,
         tl.col="black",  #Text label color
         tl.srt=90,  #Text label rotation
         diag = FALSE,
         sig.level = 0.01, insig = "blank")

4. COVID in Nepal

4.1 Total COVID cases and deaths

covid.nepal <- covid %>%
  filter(location == "Nepal") %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = total_cases + 1), color = "#2e9449", linewidth = 1) +
  geom_line(aes(y = total_deaths + 1), linewidth = 1, linetype = 2, color = "#9c2742") +
  scale_y_continuous(trans = "log10", labels = comma) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  labs(title = "Global COVID infections and deaths",
       subtitle = paste0("Till ", day_latest - 1),
       x = "",
       y = "Log10 transformation") +
  theme_apa() +
  theme(axis.text.x = element_text(angle = 90, color = "black", hjust = 1),
        axis.text = element_text(color = "black")) +
  annotate("text", x = as.Date("2021-05-05"), y = 9000, label = "Total Deaths \n", size = 3.8) +
  annotate("text", x = as.Date("2021-05-05"), y = 450000, label = "Total Cases \n", size = 3.8)  
  
ggplotly(covid.nepal) %>%
  layout(title = list(text = paste0('Global COVID infections and deaths',
                                    '<br>',
                                    '<sup>',
                                    paste0('TIll ',day_latest-1),'</sup>')),
         margin = list(t = 50))

4.2 New COVID cases and deaths

4.2.1 New COVID cases

##data manipulation for trend of new cases and new deaths in nepal
covid.nepal1 <- covid %>%
  filter(location == "Nepal") %>%
  mutate(new.cases.smoothed  = as.integer(SMA(new_cases, n = 14)),
         new.deaths.smoothed = as.integer(SMA(new_deaths, n = 14))) %>%     #as.integer is used to convert numbers with decimals into integers
  select(date, new.cases.smoothed, new.deaths.smoothed) %>%
  filter(date > "2020-02-06")     #To remove rows with NAs induced due to smoothing (14 days simple moving average)

##trend of new covid cases in nepal
p9 <- covid.nepal1 %>%
  ggplot(aes(date, new.cases.smoothed)) +
  geom_line(linewidth = 1, color = "#2C8C33") +
  labs(x = "",
       y = "Number of new cases",
       title = "Trend of New Covid infections in Nepal (14-days smoothed)",
       subtitle = paste0("Till ", day_latest - 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  annotate(geom="text", x=as.Date("2022-03-15"), y=8289, 
           label="Per day new covid cases reached\nmaximum i.e. 8,589", size = 3.5) +
  annotate(geom="point", x=as.Date("2021-05-18"), y=8589, size=6, shape=21, fill="transparent")

p9

4.2.2 New COVID deaths

p10 <- covid.nepal1 %>%
  ggplot(aes(date, new.deaths.smoothed)) +
  geom_line(linewidth = 1, color = "#CF5C5C") +
  labs(x = "",
       y = "Number of new deaths",
       title = "Trend of New Covid related deaths in Nepal (14-days smoothed)",
       subtitle = paste0("Till ", day_latest - 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  annotate(geom="text", x=as.Date("2022-03-25"), y=185, 
           label="Per day new covid deaths reached\nmaximum i.e. 190", size = 3.5) +
  annotate(geom="point", x=as.Date("2021-05-24"), y=189, size=6, shape=21, fill="transparent")

p10

4.3 Polynomial Regression of new covid cases in Nepal

nepal.df <- covid %>%
  filter(location == "Nepal") %>%
  select(new_tests, new_cases, stringency_index)
plot(nepal.df$new_tests, nepal.df$new_cases)

This plot shows that new_tests and new_cases have strong positive correlation. But it seems new_tests has few outliers which can be removed before fitting a polynomial regression model.

#Identifying the outliers position in new_tests
boxplot(nepal.df$new_tests)

out <- boxplot.stats(nepal.df$new_tests)$out
out_ind <- which(nepal.df$new_tests %in% c(out))
#Removing the outliers
nepal.df1 <- nepal.df[-out_ind,]
#Regression model
model <- lm(new_cases ~ poly(new_tests,2) + stringency_index, data = nepal.df1)
summary(model)
## 
## Call:
## lm(formula = new_cases ~ poly(new_tests, 2) + stringency_index, 
##     data = nepal.df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5660.1  -291.2   -18.1   195.1  7897.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           303.967     64.883   4.685 3.15e-06 ***
## poly(new_tests, 2)1 35187.011    995.736  35.338  < 2e-16 ***
## poly(new_tests, 2)2 24672.382    890.664  27.701  < 2e-16 ***
## stringency_index       11.814      1.176  10.050  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 876.7 on 1094 degrees of freedom
## Multiple R-squared:  0.724,  Adjusted R-squared:  0.7232 
## F-statistic: 956.5 on 3 and 1094 DF,  p-value: < 2.2e-16
  • Both independent variables new_tests and stringency_index have significant impact on new_cases at all level of significance.

  • The polynomial regression model is \(Y = 303.967 + 35187.011X_1 + 24672.382{X_1}^2 + 11.814X_2 + Error\).

    Here \(X_1\) is new_tests and \(X_2\) is stringency index.

  • Adjusted R-squared is 72.3% indicating that these two predictor variables result in 72.3% variance in the dependent variable i.e., new_cases and rest 27.7% by other factors not in this regression model.

residplot(model)

effect_plot(model, pred = new_tests, interval = T, plot.points = T, line.colors = "#4669E8")